import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("PEP1.csv")
df.head()
| Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
| 1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
| 2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
| 3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
| 4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# Dropping Id column
df.drop(['Id'], axis=1,inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1460 non-null int64 1 MSSubClass 1460 non-null int64 2 MSZoning 1460 non-null object 3 LotFrontage 1201 non-null float64 4 LotArea 1460 non-null int64 5 Street 1460 non-null object 6 Alley 91 non-null object 7 LotShape 1460 non-null object 8 LandContour 1460 non-null object 9 Utilities 1460 non-null object 10 LotConfig 1460 non-null object 11 LandSlope 1460 non-null object 12 Neighborhood 1460 non-null object 13 Condition1 1460 non-null object 14 Condition2 1460 non-null object 15 BldgType 1460 non-null object 16 HouseStyle 1460 non-null object 17 OverallQual 1460 non-null int64 18 OverallCond 1460 non-null int64 19 YearBuilt 1460 non-null int64 20 YearRemodAdd 1460 non-null int64 21 RoofStyle 1460 non-null object 22 RoofMatl 1460 non-null object 23 Exterior1st 1460 non-null object 24 Exterior2nd 1460 non-null object 25 MasVnrType 1452 non-null object 26 MasVnrArea 1452 non-null float64 27 ExterQual 1460 non-null object 28 ExterCond 1460 non-null object 29 Foundation 1460 non-null object 30 BsmtQual 1423 non-null object 31 BsmtCond 1423 non-null object 32 BsmtExposure 1422 non-null object 33 BsmtFinType1 1423 non-null object 34 BsmtFinSF1 1460 non-null int64 35 BsmtFinType2 1422 non-null object 36 BsmtFinSF2 1460 non-null int64 37 BsmtUnfSF 1460 non-null int64 38 TotalBsmtSF 1460 non-null int64 39 Heating 1460 non-null object 40 HeatingQC 1460 non-null object 41 CentralAir 1460 non-null object 42 Electrical 1459 non-null object 43 1stFlrSF 1460 non-null int64 44 2ndFlrSF 1460 non-null int64 45 LowQualFinSF 1460 non-null int64 46 GrLivArea 1460 non-null int64 47 BsmtFullBath 1460 non-null int64 48 BsmtHalfBath 1460 non-null int64 49 FullBath 1460 non-null int64 50 HalfBath 1460 non-null int64 51 BedroomAbvGr 1460 non-null int64 52 KitchebvGr 1460 non-null int64 53 KitchenQual 1460 non-null object 54 TotRmsAbvGrd 1460 non-null int64 55 Functiol 1460 non-null object 56 Fireplaces 1460 non-null int64 57 FireplaceQu 770 non-null object 58 GarageType 1379 non-null object 59 GarageYrBlt 1379 non-null float64 60 GarageFinish 1379 non-null object 61 GarageCars 1460 non-null int64 62 GarageArea 1460 non-null int64 63 GarageQual 1379 non-null object 64 GarageCond 1379 non-null object 65 PavedDrive 1460 non-null object 66 WoodDeckSF 1460 non-null int64 67 OpenPorchSF 1460 non-null int64 68 EnclosedPorch 1460 non-null int64 69 3SsnPorch 1460 non-null int64 70 ScreenPorch 1460 non-null int64 71 PoolArea 1460 non-null int64 72 PoolQC 7 non-null object 73 Fence 281 non-null object 74 MiscFeature 54 non-null object 75 MiscVal 1460 non-null int64 76 MoSold 1460 non-null int64 77 YrSold 1460 non-null int64 78 SaleType 1460 non-null object 79 SaleCondition 1460 non-null object 80 SalePrice 1460 non-null int64 dtypes: float64(3), int64(35), object(43) memory usage: 924.0+ KB
df.shape
(1460, 80)
Pandas doesn't print all columns by default. If you want to permanently set Pandas print options to display the output of all columns. you can use any of these options for a TEMPORARY solution:
null_values = pd.isnull(df).sum()[pd.isnull(df).sum()>0]
null_values
LotFrontage 259 Alley 1369 MasVnrType 8 MasVnrArea 8 BsmtQual 37 BsmtCond 37 BsmtExposure 38 BsmtFinType1 37 BsmtFinType2 38 Electrical 1 FireplaceQu 690 GarageType 81 GarageYrBlt 81 GarageFinish 81 GarageQual 81 GarageCond 81 PoolQC 1453 Fence 1179 MiscFeature 1406 dtype: int64
def plot_nas(df: pd.DataFrame):
if df.isnull().sum().sum() != 0:
na_df = (df.isnull().sum() / len(df))
na_df = na_df.drop(na_df[na_df == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Variables with null values' :na_df})
missing_data.plot(kind = "barh")
plt.show()
else:
print('No NAs found')
plot_nas(df)
for col in df:
print(col, df[col].nunique())
MSSubClass 15 MSZoning 5 LotFrontage 110 LotArea 1073 Street 2 Alley 2 LotShape 4 LandContour 4 Utilities 2 LotConfig 5 LandSlope 3 Neighborhood 25 Condition1 9 Condition2 8 BldgType 5 HouseStyle 8 OverallQual 10 OverallCond 9 YearBuilt 112 YearRemodAdd 61 RoofStyle 6 RoofMatl 8 Exterior1st 15 Exterior2nd 16 MasVnrType 4 MasVnrArea 327 ExterQual 4 ExterCond 5 Foundation 6 BsmtQual 4 BsmtCond 4 BsmtExposure 4 BsmtFinType1 6 BsmtFinSF1 637 BsmtFinType2 6 BsmtFinSF2 144 BsmtUnfSF 780 TotalBsmtSF 721 Heating 6 HeatingQC 5 CentralAir 2 Electrical 5 1stFlrSF 753 2ndFlrSF 417 LowQualFinSF 24 GrLivArea 861 BsmtFullBath 4 BsmtHalfBath 3 FullBath 4 HalfBath 3 BedroomAbvGr 8 KitchebvGr 4 KitchenQual 4 TotRmsAbvGrd 12 Functiol 7 Fireplaces 4 FireplaceQu 5 GarageType 6 GarageYrBlt 97 GarageFinish 3 GarageCars 5 GarageArea 441 GarageQual 5 GarageCond 5 PavedDrive 3 WoodDeckSF 274 OpenPorchSF 202 EnclosedPorch 120 3SsnPorch 20 ScreenPorch 76 PoolArea 8 PoolQC 3 Fence 4 MiscFeature 4 MiscVal 21 MoSold 12 YrSold 5 SaleType 9 SaleCondition 6 SalePrice 663
Generate a separate dataset for numerical and categorical variables
nu = df.nunique().reset_index()
nu.columns = ['feature','nunique']
plt.figure(figsize=(7,20))
ax = sns.barplot(y='feature', x='nunique', data=nu)
numeric_data = df.select_dtypes(include=[np.number])
numeric_data.head(3)
| MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60 | 65.0 | 8450 | 7 | 5 | 2003 | 2003 | 196.0 | 706 | 0 | ... | 0 | 61 | 0 | 0 | 0 | 0 | 0 | 2 | 2008 | 208500 |
| 1 | 20 | 80.0 | 9600 | 6 | 8 | 1976 | 1976 | 0.0 | 978 | 0 | ... | 298 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | 2007 | 181500 |
| 2 | 60 | 68.0 | 11250 | 7 | 5 | 2001 | 2002 | 162.0 | 486 | 0 | ... | 0 | 42 | 0 | 0 | 0 | 0 | 0 | 9 | 2008 | 223500 |
3 rows × 37 columns
# Identifying shape of the dataset
numeric_data.shape
(1460, 37)
categorical_data = df.select_dtypes(exclude=[np.number])
categorical_data.head(3)
| MSZoning | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | ... | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | PoolQC | Fence | MiscFeature | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 1 | RL | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 2 | RL | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
3 rows × 43 columns
pd.isnull(numeric_data).sum()[pd.isnull(numeric_data).sum()>0]
LotFrontage 259 MasVnrArea 8 GarageYrBlt 81 dtype: int64
numeric_data['LotFrontage'].fillna(numeric_data['LotFrontage'].mean(), inplace = True)
numeric_data['MasVnrArea'].fillna(numeric_data['MasVnrArea'].mean(), inplace = True)
numeric_data['GarageYrBlt'].fillna(numeric_data['GarageYrBlt'].mean(), inplace = True)
# Checking if there is any null value present
pd.isnull(numeric_data).sum().sum()
0
skew=numeric_data.skew(axis = 0, skipna = True)
print(skew)
Id 0.000000 MSSubClass 1.407657 LotFrontage 2.384950 LotArea 12.207688 OverallQual 0.216944 OverallCond 0.693067 YearBuilt -0.613461 YearRemodAdd -0.503562 MasVnrArea 2.676412 BsmtFinSF1 1.685503 BsmtFinSF2 4.255261 BsmtUnfSF 0.920268 TotalBsmtSF 1.524255 1stFlrSF 1.376757 2ndFlrSF 0.813030 LowQualFinSF 9.011341 GrLivArea 1.366560 BsmtFullBath 0.596067 BsmtHalfBath 4.103403 FullBath 0.036562 HalfBath 0.675897 BedroomAbvGr 0.211790 KitchebvGr 4.488397 TotRmsAbvGrd 0.676341 Fireplaces 0.649565 GarageYrBlt -0.668175 GarageCars -0.342549 GarageArea 0.179981 WoodDeckSF 1.541376 OpenPorchSF 2.364342 EnclosedPorch 3.089872 3SsnPorch 10.304342 ScreenPorch 4.122214 PoolArea 14.828374 MiscVal 24.476794 MoSold 0.212053 YrSold 0.096269 SalePrice 1.882876 dtype: float64
for col in numeric_data:
print(col,skew[col])
plt.figure()
sns.distplot(numeric_data[col])
plt.show()
MSSubClass 1.4076567471495591
LotFrontage 2.3849501679427925
LotArea 12.207687851233496
OverallQual 0.2169439277628693
OverallCond 0.6930674724842182
YearBuilt -0.613461172488183
YearRemodAdd -0.5035620027004709
MasVnrArea 2.676411784729959
BsmtFinSF1 1.685503071910789
BsmtFinSF2 4.255261108933303
BsmtUnfSF 0.9202684528039037
TotalBsmtSF 1.5242545490627664
1stFlrSF 1.3767566220336365
2ndFlrSF 0.8130298163023265
LowQualFinSF 9.011341288465387
GrLivArea 1.3665603560164552
BsmtFullBath 0.596066609663168
BsmtHalfBath 4.103402697955168
FullBath 0.036561558402727165
HalfBath 0.675897448233722
BedroomAbvGr 0.21179009627507137
KitchebvGr 4.488396777072859
TotRmsAbvGrd 0.6763408364355531
Fireplaces 0.6495651830548841
GarageYrBlt -0.6681748227563908
GarageCars -0.3425489297486655
GarageArea 0.17998090674623907
WoodDeckSF 1.5413757571931312
OpenPorchSF 2.3643417403694404
EnclosedPorch 3.08987190371177
3SsnPorch 10.304342032693112
ScreenPorch 4.122213743143115
PoolArea 14.828373640750588
MiscVal 24.476794188821916
MoSold 0.21205298505146022
YrSold 0.09626851386568028
SalePrice 1.8828757597682129
corr = numeric_data.corr()
corr.style.background_gradient(cmap='twilight_shifted').set_precision(2)
| Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchebvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Id | 1.00 | 0.01 | -0.01 | -0.03 | -0.03 | 0.01 | -0.01 | -0.02 | -0.05 | -0.01 | -0.01 | -0.01 | -0.02 | 0.01 | 0.01 | -0.04 | 0.01 | 0.00 | -0.02 | 0.01 | 0.01 | 0.04 | 0.00 | 0.03 | -0.02 | 0.00 | 0.02 | 0.02 | -0.03 | -0.00 | 0.00 | -0.05 | 0.00 | 0.06 | -0.01 | 0.02 | 0.00 | -0.02 |
| MSSubClass | 0.01 | 1.00 | -0.36 | -0.14 | 0.03 | -0.06 | 0.03 | 0.04 | 0.02 | -0.07 | -0.07 | -0.14 | -0.24 | -0.25 | 0.31 | 0.05 | 0.07 | 0.00 | -0.00 | 0.13 | 0.18 | -0.02 | 0.28 | 0.04 | -0.05 | 0.08 | -0.04 | -0.10 | -0.01 | -0.01 | -0.01 | -0.04 | -0.03 | 0.01 | -0.01 | -0.01 | -0.02 | -0.08 |
| LotFrontage | -0.01 | -0.36 | 1.00 | 0.31 | 0.23 | -0.05 | 0.12 | 0.08 | 0.18 | 0.22 | 0.04 | 0.12 | 0.36 | 0.41 | 0.07 | 0.04 | 0.37 | 0.09 | -0.01 | 0.18 | 0.05 | 0.24 | -0.01 | 0.32 | 0.24 | 0.06 | 0.27 | 0.32 | 0.08 | 0.14 | 0.01 | 0.06 | 0.04 | 0.18 | 0.00 | 0.01 | 0.01 | 0.33 |
| LotArea | -0.03 | -0.14 | 0.31 | 1.00 | 0.11 | -0.01 | 0.01 | 0.01 | 0.10 | 0.21 | 0.11 | -0.00 | 0.26 | 0.30 | 0.05 | 0.00 | 0.26 | 0.16 | 0.05 | 0.13 | 0.01 | 0.12 | -0.02 | 0.19 | 0.27 | -0.02 | 0.15 | 0.18 | 0.17 | 0.08 | -0.02 | 0.02 | 0.04 | 0.08 | 0.04 | 0.00 | -0.01 | 0.26 |
| OverallQual | -0.03 | 0.03 | 0.23 | 0.11 | 1.00 | -0.09 | 0.57 | 0.55 | 0.41 | 0.24 | -0.06 | 0.31 | 0.54 | 0.48 | 0.30 | -0.03 | 0.59 | 0.11 | -0.04 | 0.55 | 0.27 | 0.10 | -0.18 | 0.43 | 0.40 | 0.52 | 0.60 | 0.56 | 0.24 | 0.31 | -0.11 | 0.03 | 0.06 | 0.07 | -0.03 | 0.07 | -0.03 | 0.79 |
| OverallCond | 0.01 | -0.06 | -0.05 | -0.01 | -0.09 | 1.00 | -0.38 | 0.07 | -0.13 | -0.05 | 0.04 | -0.14 | -0.17 | -0.14 | 0.03 | 0.03 | -0.08 | -0.05 | 0.12 | -0.19 | -0.06 | 0.01 | -0.09 | -0.06 | -0.02 | -0.31 | -0.19 | -0.15 | -0.00 | -0.03 | 0.07 | 0.03 | 0.05 | -0.00 | 0.07 | -0.00 | 0.04 | -0.08 |
| YearBuilt | -0.01 | 0.03 | 0.12 | 0.01 | 0.57 | -0.38 | 1.00 | 0.59 | 0.31 | 0.25 | -0.05 | 0.15 | 0.39 | 0.28 | 0.01 | -0.18 | 0.20 | 0.19 | -0.04 | 0.47 | 0.24 | -0.07 | -0.17 | 0.10 | 0.15 | 0.78 | 0.54 | 0.48 | 0.22 | 0.19 | -0.39 | 0.03 | -0.05 | 0.00 | -0.03 | 0.01 | -0.01 | 0.52 |
| YearRemodAdd | -0.02 | 0.04 | 0.08 | 0.01 | 0.55 | 0.07 | 0.59 | 1.00 | 0.18 | 0.13 | -0.07 | 0.18 | 0.29 | 0.24 | 0.14 | -0.06 | 0.29 | 0.12 | -0.01 | 0.44 | 0.18 | -0.04 | -0.15 | 0.19 | 0.11 | 0.62 | 0.42 | 0.37 | 0.21 | 0.23 | -0.19 | 0.05 | -0.04 | 0.01 | -0.01 | 0.02 | 0.04 | 0.51 |
| MasVnrArea | -0.05 | 0.02 | 0.18 | 0.10 | 0.41 | -0.13 | 0.31 | 0.18 | 1.00 | 0.26 | -0.07 | 0.11 | 0.36 | 0.34 | 0.17 | -0.07 | 0.39 | 0.09 | 0.03 | 0.28 | 0.20 | 0.10 | -0.04 | 0.28 | 0.25 | 0.25 | 0.36 | 0.37 | 0.16 | 0.12 | -0.11 | 0.02 | 0.06 | 0.01 | -0.03 | -0.01 | -0.01 | 0.48 |
| BsmtFinSF1 | -0.01 | -0.07 | 0.22 | 0.21 | 0.24 | -0.05 | 0.25 | 0.13 | 0.26 | 1.00 | -0.05 | -0.50 | 0.52 | 0.45 | -0.14 | -0.06 | 0.21 | 0.65 | 0.07 | 0.06 | 0.00 | -0.11 | -0.08 | 0.04 | 0.26 | 0.15 | 0.22 | 0.30 | 0.20 | 0.11 | -0.10 | 0.03 | 0.06 | 0.14 | 0.00 | -0.02 | 0.01 | 0.39 |
| BsmtFinSF2 | -0.01 | -0.07 | 0.04 | 0.11 | -0.06 | 0.04 | -0.05 | -0.07 | -0.07 | -0.05 | 1.00 | -0.21 | 0.10 | 0.10 | -0.10 | 0.01 | -0.01 | 0.16 | 0.07 | -0.08 | -0.03 | -0.02 | -0.04 | -0.04 | 0.05 | -0.09 | -0.04 | -0.02 | 0.07 | 0.00 | 0.04 | -0.03 | 0.09 | 0.04 | 0.00 | -0.02 | 0.03 | -0.01 |
| BsmtUnfSF | -0.01 | -0.14 | 0.12 | -0.00 | 0.31 | -0.14 | 0.15 | 0.18 | 0.11 | -0.50 | -0.21 | 1.00 | 0.42 | 0.32 | 0.00 | 0.03 | 0.24 | -0.42 | -0.10 | 0.29 | -0.04 | 0.17 | 0.03 | 0.25 | 0.05 | 0.19 | 0.21 | 0.18 | -0.01 | 0.13 | -0.00 | 0.02 | -0.01 | -0.04 | -0.02 | 0.03 | -0.04 | 0.21 |
| TotalBsmtSF | -0.02 | -0.24 | 0.36 | 0.26 | 0.54 | -0.17 | 0.39 | 0.29 | 0.36 | 0.52 | 0.10 | 0.42 | 1.00 | 0.82 | -0.17 | -0.03 | 0.45 | 0.31 | -0.00 | 0.32 | -0.05 | 0.05 | -0.07 | 0.29 | 0.34 | 0.31 | 0.43 | 0.49 | 0.23 | 0.25 | -0.10 | 0.04 | 0.08 | 0.13 | -0.02 | 0.01 | -0.01 | 0.61 |
| 1stFlrSF | 0.01 | -0.25 | 0.41 | 0.30 | 0.48 | -0.14 | 0.28 | 0.24 | 0.34 | 0.45 | 0.10 | 0.32 | 0.82 | 1.00 | -0.20 | -0.01 | 0.57 | 0.24 | 0.00 | 0.38 | -0.12 | 0.13 | 0.07 | 0.41 | 0.41 | 0.23 | 0.44 | 0.49 | 0.24 | 0.21 | -0.07 | 0.06 | 0.09 | 0.13 | -0.02 | 0.03 | -0.01 | 0.61 |
| 2ndFlrSF | 0.01 | 0.31 | 0.07 | 0.05 | 0.30 | 0.03 | 0.01 | 0.14 | 0.17 | -0.14 | -0.10 | 0.00 | -0.17 | -0.20 | 1.00 | 0.06 | 0.69 | -0.17 | -0.02 | 0.42 | 0.61 | 0.50 | 0.06 | 0.62 | 0.19 | 0.07 | 0.18 | 0.14 | 0.09 | 0.21 | 0.06 | -0.02 | 0.04 | 0.08 | 0.02 | 0.04 | -0.03 | 0.32 |
| LowQualFinSF | -0.04 | 0.05 | 0.04 | 0.00 | -0.03 | 0.03 | -0.18 | -0.06 | -0.07 | -0.06 | 0.01 | 0.03 | -0.03 | -0.01 | 0.06 | 1.00 | 0.13 | -0.05 | -0.01 | -0.00 | -0.03 | 0.11 | 0.01 | 0.13 | -0.02 | -0.03 | -0.09 | -0.07 | -0.03 | 0.02 | 0.06 | -0.00 | 0.03 | 0.06 | -0.00 | -0.02 | -0.03 | -0.03 |
| GrLivArea | 0.01 | 0.07 | 0.37 | 0.26 | 0.59 | -0.08 | 0.20 | 0.29 | 0.39 | 0.21 | -0.01 | 0.24 | 0.45 | 0.57 | 0.69 | 0.13 | 1.00 | 0.03 | -0.02 | 0.63 | 0.42 | 0.52 | 0.10 | 0.83 | 0.46 | 0.22 | 0.47 | 0.47 | 0.25 | 0.33 | 0.01 | 0.02 | 0.10 | 0.17 | -0.00 | 0.05 | -0.04 | 0.71 |
| BsmtFullBath | 0.00 | 0.00 | 0.09 | 0.16 | 0.11 | -0.05 | 0.19 | 0.12 | 0.09 | 0.65 | 0.16 | -0.42 | 0.31 | 0.24 | -0.17 | -0.05 | 0.03 | 1.00 | -0.15 | -0.06 | -0.03 | -0.15 | -0.04 | -0.05 | 0.14 | 0.12 | 0.13 | 0.18 | 0.18 | 0.07 | -0.05 | -0.00 | 0.02 | 0.07 | -0.02 | -0.03 | 0.07 | 0.23 |
| BsmtHalfBath | -0.02 | -0.00 | -0.01 | 0.05 | -0.04 | 0.12 | -0.04 | -0.01 | 0.03 | 0.07 | 0.07 | -0.10 | -0.00 | 0.00 | -0.02 | -0.01 | -0.02 | -0.15 | 1.00 | -0.05 | -0.01 | 0.05 | -0.04 | -0.02 | 0.03 | -0.08 | -0.02 | -0.02 | 0.04 | -0.03 | -0.01 | 0.04 | 0.03 | 0.02 | -0.01 | 0.03 | -0.05 | -0.02 |
| FullBath | 0.01 | 0.13 | 0.18 | 0.13 | 0.55 | -0.19 | 0.47 | 0.44 | 0.28 | 0.06 | -0.08 | 0.29 | 0.32 | 0.38 | 0.42 | -0.00 | 0.63 | -0.06 | -0.05 | 1.00 | 0.14 | 0.36 | 0.13 | 0.55 | 0.24 | 0.47 | 0.47 | 0.41 | 0.19 | 0.26 | -0.12 | 0.04 | -0.01 | 0.05 | -0.01 | 0.06 | -0.02 | 0.56 |
| HalfBath | 0.01 | 0.18 | 0.05 | 0.01 | 0.27 | -0.06 | 0.24 | 0.18 | 0.20 | 0.00 | -0.03 | -0.04 | -0.05 | -0.12 | 0.61 | -0.03 | 0.42 | -0.03 | -0.01 | 0.14 | 1.00 | 0.23 | -0.07 | 0.34 | 0.20 | 0.19 | 0.22 | 0.16 | 0.11 | 0.20 | -0.10 | -0.00 | 0.07 | 0.02 | 0.00 | -0.01 | -0.01 | 0.28 |
| BedroomAbvGr | 0.04 | -0.02 | 0.24 | 0.12 | 0.10 | 0.01 | -0.07 | -0.04 | 0.10 | -0.11 | -0.02 | 0.17 | 0.05 | 0.13 | 0.50 | 0.11 | 0.52 | -0.15 | 0.05 | 0.36 | 0.23 | 1.00 | 0.20 | 0.68 | 0.11 | -0.06 | 0.09 | 0.07 | 0.05 | 0.09 | 0.04 | -0.02 | 0.04 | 0.07 | 0.01 | 0.05 | -0.04 | 0.17 |
| KitchebvGr | 0.00 | 0.28 | -0.01 | -0.02 | -0.18 | -0.09 | -0.17 | -0.15 | -0.04 | -0.08 | -0.04 | 0.03 | -0.07 | 0.07 | 0.06 | 0.01 | 0.10 | -0.04 | -0.04 | 0.13 | -0.07 | 0.20 | 1.00 | 0.26 | -0.12 | -0.11 | -0.05 | -0.06 | -0.09 | -0.07 | 0.04 | -0.02 | -0.05 | -0.01 | 0.06 | 0.03 | 0.03 | -0.14 |
| TotRmsAbvGrd | 0.03 | 0.04 | 0.32 | 0.19 | 0.43 | -0.06 | 0.10 | 0.19 | 0.28 | 0.04 | -0.04 | 0.25 | 0.29 | 0.41 | 0.62 | 0.13 | 0.83 | -0.05 | -0.02 | 0.55 | 0.34 | 0.68 | 0.26 | 1.00 | 0.33 | 0.14 | 0.36 | 0.34 | 0.17 | 0.23 | 0.00 | -0.01 | 0.06 | 0.08 | 0.02 | 0.04 | -0.03 | 0.53 |
| Fireplaces | -0.02 | -0.05 | 0.24 | 0.27 | 0.40 | -0.02 | 0.15 | 0.11 | 0.25 | 0.26 | 0.05 | 0.05 | 0.34 | 0.41 | 0.19 | -0.02 | 0.46 | 0.14 | 0.03 | 0.24 | 0.20 | 0.11 | -0.12 | 0.33 | 1.00 | 0.05 | 0.30 | 0.27 | 0.20 | 0.17 | -0.02 | 0.01 | 0.18 | 0.10 | 0.00 | 0.05 | -0.02 | 0.47 |
| GarageYrBlt | 0.00 | 0.08 | 0.06 | -0.02 | 0.52 | -0.31 | 0.78 | 0.62 | 0.25 | 0.15 | -0.09 | 0.19 | 0.31 | 0.23 | 0.07 | -0.03 | 0.22 | 0.12 | -0.08 | 0.47 | 0.19 | -0.06 | -0.11 | 0.14 | 0.05 | 1.00 | 0.48 | 0.48 | 0.22 | 0.22 | -0.29 | 0.02 | -0.08 | -0.01 | -0.03 | 0.01 | -0.00 | 0.47 |
| GarageCars | 0.02 | -0.04 | 0.27 | 0.15 | 0.60 | -0.19 | 0.54 | 0.42 | 0.36 | 0.22 | -0.04 | 0.21 | 0.43 | 0.44 | 0.18 | -0.09 | 0.47 | 0.13 | -0.02 | 0.47 | 0.22 | 0.09 | -0.05 | 0.36 | 0.30 | 0.48 | 1.00 | 0.88 | 0.23 | 0.21 | -0.15 | 0.04 | 0.05 | 0.02 | -0.04 | 0.04 | -0.04 | 0.64 |
| GarageArea | 0.02 | -0.10 | 0.32 | 0.18 | 0.56 | -0.15 | 0.48 | 0.37 | 0.37 | 0.30 | -0.02 | 0.18 | 0.49 | 0.49 | 0.14 | -0.07 | 0.47 | 0.18 | -0.02 | 0.41 | 0.16 | 0.07 | -0.06 | 0.34 | 0.27 | 0.48 | 0.88 | 1.00 | 0.22 | 0.24 | -0.12 | 0.04 | 0.05 | 0.06 | -0.03 | 0.03 | -0.03 | 0.62 |
| WoodDeckSF | -0.03 | -0.01 | 0.08 | 0.17 | 0.24 | -0.00 | 0.22 | 0.21 | 0.16 | 0.20 | 0.07 | -0.01 | 0.23 | 0.24 | 0.09 | -0.03 | 0.25 | 0.18 | 0.04 | 0.19 | 0.11 | 0.05 | -0.09 | 0.17 | 0.20 | 0.22 | 0.23 | 0.22 | 1.00 | 0.06 | -0.13 | -0.03 | -0.07 | 0.07 | -0.01 | 0.02 | 0.02 | 0.32 |
| OpenPorchSF | -0.00 | -0.01 | 0.14 | 0.08 | 0.31 | -0.03 | 0.19 | 0.23 | 0.12 | 0.11 | 0.00 | 0.13 | 0.25 | 0.21 | 0.21 | 0.02 | 0.33 | 0.07 | -0.03 | 0.26 | 0.20 | 0.09 | -0.07 | 0.23 | 0.17 | 0.22 | 0.21 | 0.24 | 0.06 | 1.00 | -0.09 | -0.01 | 0.07 | 0.06 | -0.02 | 0.07 | -0.06 | 0.32 |
| EnclosedPorch | 0.00 | -0.01 | 0.01 | -0.02 | -0.11 | 0.07 | -0.39 | -0.19 | -0.11 | -0.10 | 0.04 | -0.00 | -0.10 | -0.07 | 0.06 | 0.06 | 0.01 | -0.05 | -0.01 | -0.12 | -0.10 | 0.04 | 0.04 | 0.00 | -0.02 | -0.29 | -0.15 | -0.12 | -0.13 | -0.09 | 1.00 | -0.04 | -0.08 | 0.05 | 0.02 | -0.03 | -0.01 | -0.13 |
| 3SsnPorch | -0.05 | -0.04 | 0.06 | 0.02 | 0.03 | 0.03 | 0.03 | 0.05 | 0.02 | 0.03 | -0.03 | 0.02 | 0.04 | 0.06 | -0.02 | -0.00 | 0.02 | -0.00 | 0.04 | 0.04 | -0.00 | -0.02 | -0.02 | -0.01 | 0.01 | 0.02 | 0.04 | 0.04 | -0.03 | -0.01 | -0.04 | 1.00 | -0.03 | -0.01 | 0.00 | 0.03 | 0.02 | 0.04 |
| ScreenPorch | 0.00 | -0.03 | 0.04 | 0.04 | 0.06 | 0.05 | -0.05 | -0.04 | 0.06 | 0.06 | 0.09 | -0.01 | 0.08 | 0.09 | 0.04 | 0.03 | 0.10 | 0.02 | 0.03 | -0.01 | 0.07 | 0.04 | -0.05 | 0.06 | 0.18 | -0.08 | 0.05 | 0.05 | -0.07 | 0.07 | -0.08 | -0.03 | 1.00 | 0.05 | 0.03 | 0.02 | 0.01 | 0.11 |
| PoolArea | 0.06 | 0.01 | 0.18 | 0.08 | 0.07 | -0.00 | 0.00 | 0.01 | 0.01 | 0.14 | 0.04 | -0.04 | 0.13 | 0.13 | 0.08 | 0.06 | 0.17 | 0.07 | 0.02 | 0.05 | 0.02 | 0.07 | -0.01 | 0.08 | 0.10 | -0.01 | 0.02 | 0.06 | 0.07 | 0.06 | 0.05 | -0.01 | 0.05 | 1.00 | 0.03 | -0.03 | -0.06 | 0.09 |
| MiscVal | -0.01 | -0.01 | 0.00 | 0.04 | -0.03 | 0.07 | -0.03 | -0.01 | -0.03 | 0.00 | 0.00 | -0.02 | -0.02 | -0.02 | 0.02 | -0.00 | -0.00 | -0.02 | -0.01 | -0.01 | 0.00 | 0.01 | 0.06 | 0.02 | 0.00 | -0.03 | -0.04 | -0.03 | -0.01 | -0.02 | 0.02 | 0.00 | 0.03 | 0.03 | 1.00 | -0.01 | 0.00 | -0.02 |
| MoSold | 0.02 | -0.01 | 0.01 | 0.00 | 0.07 | -0.00 | 0.01 | 0.02 | -0.01 | -0.02 | -0.02 | 0.03 | 0.01 | 0.03 | 0.04 | -0.02 | 0.05 | -0.03 | 0.03 | 0.06 | -0.01 | 0.05 | 0.03 | 0.04 | 0.05 | 0.01 | 0.04 | 0.03 | 0.02 | 0.07 | -0.03 | 0.03 | 0.02 | -0.03 | -0.01 | 1.00 | -0.15 | 0.05 |
| YrSold | 0.00 | -0.02 | 0.01 | -0.01 | -0.03 | 0.04 | -0.01 | 0.04 | -0.01 | 0.01 | 0.03 | -0.04 | -0.01 | -0.01 | -0.03 | -0.03 | -0.04 | 0.07 | -0.05 | -0.02 | -0.01 | -0.04 | 0.03 | -0.03 | -0.02 | -0.00 | -0.04 | -0.03 | 0.02 | -0.06 | -0.01 | 0.02 | 0.01 | -0.06 | 0.00 | -0.15 | 1.00 | -0.03 |
| SalePrice | -0.02 | -0.08 | 0.33 | 0.26 | 0.79 | -0.08 | 0.52 | 0.51 | 0.48 | 0.39 | -0.01 | 0.21 | 0.61 | 0.61 | 0.32 | -0.03 | 0.71 | 0.23 | -0.02 | 0.56 | 0.28 | 0.17 | -0.14 | 0.53 | 0.47 | 0.47 | 0.64 | 0.62 | 0.32 | 0.32 | -0.13 | 0.04 | 0.11 | 0.09 | -0.02 | 0.05 | -0.03 | 1.00 |
# Identifying significant variables using a correlation matrix----> seaborn
# plt.figure(figsize=(25,25))
# sns.heatmap(numeric_data.corr(),annot=True,cmap='Blues')
sns.pairplot(numeric_data,diag_kws={'color':'red'},
plot_kws={'color':'green'})
plt.show()
# for i in numeric_data.columns:
# sns.boxplot(numeric_data[i],color='Green')
# plt.show()
# for i in numeric_data.columns:
# q1=numeric_data[i].quantile(0.25)
# q3=numeric_data[i].quantile(0.75)
# iqr=q3-q1
# UB= q3+1.5*iqr
# LB =q1-1.5*iqr
# numeric_data[i]=np.where(numeric_data[i]>UB, UB, numeric_data[i])
# numeric_data[i]=np.where(numeric_data[i]<LB, LB, numeric_data[i])
categorical_data.head(3)
| MSZoning | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | ... | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | PoolQC | Fence | MiscFeature | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RL | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 1 | RL | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
| 2 | RL | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | ... | Attchd | RFn | TA | TA | Y | NaN | NaN | NaN | WD | Normal |
3 rows × 43 columns
pd.isnull(categorical_data).sum()[pd.isnull(categorical_data).sum()>0]
Alley 1369 MasVnrType 8 BsmtQual 37 BsmtCond 37 BsmtExposure 38 BsmtFinType1 37 BsmtFinType2 38 Electrical 1 FireplaceQu 690 GarageType 81 GarageFinish 81 GarageQual 81 GarageCond 81 PoolQC 1453 Fence 1179 MiscFeature 1406 dtype: int64
# Below code gives percentage of null in every column
null_percentage = categorical_data.isnull().sum()/numeric_data.shape[0]*100
# Below code gives list of columns having more than 60% null
col_to_drop = null_percentage[null_percentage>60].keys()
output_df = categorical_data.drop(col_to_drop, axis=1)
from sklearn.impute import SimpleImputer
value = output_df.values
# defining the imputer
imputer = SimpleImputer(missing_values=np.nan ,strategy='most_frequent')
#for col in cdata:
transformed_values_cate_data = imputer.fit_transform(value)
type(transformed_values_cate_data)
numpy.ndarray
transformed_values_cate_data=pd.DataFrame(transformed_values_cate_data)
transformed_values_cate_data
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | ... | Gd | Typ | Gd | Attchd | RFn | TA | TA | Y | WD | Normal |
| 1 | RL | Pave | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | ... | TA | Typ | TA | Attchd | RFn | TA | TA | Y | WD | Normal |
| 2 | RL | Pave | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | ... | Gd | Typ | TA | Attchd | RFn | TA | TA | Y | WD | Normal |
| 3 | RL | Pave | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | ... | Gd | Typ | Gd | Detchd | Unf | TA | TA | Y | WD | Abnorml |
| 4 | RL | Pave | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | ... | Gd | Typ | TA | Attchd | RFn | TA | TA | Y | WD | Normal |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | ... | TA | Typ | TA | Attchd | RFn | TA | TA | Y | WD | Normal |
| 1456 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | NWAmes | Norm | Norm | ... | TA | Min1 | TA | Attchd | Unf | TA | TA | Y | WD | Normal |
| 1457 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | Crawfor | Norm | Norm | ... | Gd | Typ | Gd | Attchd | RFn | TA | TA | Y | WD | Normal |
| 1458 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | mes | Norm | Norm | ... | Gd | Typ | Gd | Attchd | Unf | TA | TA | Y | WD | Normal |
| 1459 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | Edwards | Norm | Norm | ... | TA | Typ | Gd | Attchd | Fin | TA | TA | Y | WD | Normal |
1460 rows × 39 columns
pd.isnull(transformed_values_cate_data).sum().sum()
# pd.isnull(transformed_values_cate_data).sum()[pd.isnull(transformed_values_cate_data).sum()>0]
0
# col_add=output_df.columns
# col_add
transformed_values_cate_data.columns=output_df.columns
# Viewing datast
transformed_values_cate_data.head(3)
| MSZoning | Street | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | ... | KitchenQual | Functiol | FireplaceQu | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RL | Pave | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | ... | Gd | Typ | Gd | Attchd | RFn | TA | TA | Y | WD | Normal |
| 1 | RL | Pave | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | ... | TA | Typ | TA | Attchd | RFn | TA | TA | Y | WD | Normal |
| 2 | RL | Pave | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | ... | Gd | Typ | TA | Attchd | RFn | TA | TA | Y | WD | Normal |
3 rows × 39 columns
for col in transformed_values_cate_data:
plt.figure(figsize=(7,5))
sns.countplot(y=transformed_values_cate_data[col])
plt.show()
Y = numeric_data['SalePrice']
for col in transformed_values_cate_data:
plt.figure(figsize=(7,5))
sns.boxplot(x=Y,y=transformed_values_cate_data[col],showcaps=False,
flierprops={"marker": "x"},
boxprops={"facecolor": (.4, .6, .8, .5)},
medianprops={"color": "coral"})
plt.show()
from scipy import stats
for i in transformed_values_cate_data.columns:
x=pd.crosstab(transformed_values_cate_data[i],df.SalePrice)
coeff,pval,dof,exp=stats.chi2_contingency(x.values)
if pval<0.05:
print('H1--Alternate------>Relation exists',i,df.columns[-1],str(pval))
else:
print('H0--Null Hypothsis---- No Relation',i,df.columns[-1],'Dropping the cloumns',str(pval))
transformed_values_cate_data.drop(i,axis=1,inplace=True)
H1--Alternate------>Relation exists MSZoning SalePrice 4.3483250606822396e-11 H1--Alternate------>Relation exists Street SalePrice 8.338870380464053e-09 H1--Alternate------>Relation exists LotShape SalePrice 4.724729155980402e-12 H0--Null Hypothsis---- No Relation LandContour SalePrice Dropping the cloumns 0.08674645041917711 H0--Null Hypothsis---- No Relation Utilities SalePrice Dropping the cloumns 1.0 H1--Alternate------>Relation exists LotConfig SalePrice 0.045806211958033756 H0--Null Hypothsis---- No Relation LandSlope SalePrice Dropping the cloumns 0.10508638737793884 H1--Alternate------>Relation exists Neighborhood SalePrice 1.364960102688296e-08 H0--Null Hypothsis---- No Relation Condition1 SalePrice Dropping the cloumns 1.0 H0--Null Hypothsis---- No Relation Condition2 SalePrice Dropping the cloumns 0.07598640644469955 H0--Null Hypothsis---- No Relation BldgType SalePrice Dropping the cloumns 0.9999860714473023 H0--Null Hypothsis---- No Relation HouseStyle SalePrice Dropping the cloumns 0.6482615179447816 H0--Null Hypothsis---- No Relation RoofStyle SalePrice Dropping the cloumns 1.0 H0--Null Hypothsis---- No Relation RoofMatl SalePrice Dropping the cloumns 1.0 H0--Null Hypothsis---- No Relation Exterior1st SalePrice Dropping the cloumns 0.9999839433628513 H0--Null Hypothsis---- No Relation Exterior2nd SalePrice Dropping the cloumns 0.8469189757654344 H1--Alternate------>Relation exists MasVnrType SalePrice 1.597523579519887e-06 H1--Alternate------>Relation exists ExterQual SalePrice 4.250289171585687e-34 H1--Alternate------>Relation exists ExterCond SalePrice 9.869790306250171e-13 H1--Alternate------>Relation exists Foundation SalePrice 9.66452199704509e-06 H1--Alternate------>Relation exists BsmtQual SalePrice 1.8224981926842858e-22 H1--Alternate------>Relation exists BsmtCond SalePrice 3.2510088142893966e-13 H1--Alternate------>Relation exists BsmtExposure SalePrice 9.898247719408021e-08 H0--Null Hypothsis---- No Relation BsmtFinType1 SalePrice Dropping the cloumns 0.9663365392953103 H0--Null Hypothsis---- No Relation BsmtFinType2 SalePrice Dropping the cloumns 0.999999609869471 H1--Alternate------>Relation exists Heating SalePrice 2.477753304101386e-24 H0--Null Hypothsis---- No Relation HeatingQC SalePrice Dropping the cloumns 0.9995946871632676 H1--Alternate------>Relation exists CentralAir SalePrice 1.2257126695737677e-05 H0--Null Hypothsis---- No Relation Electrical SalePrice Dropping the cloumns 0.052835134010111195 H1--Alternate------>Relation exists KitchenQual SalePrice 1.2820744991685297e-31 H0--Null Hypothsis---- No Relation Functiol SalePrice Dropping the cloumns 1.0 H1--Alternate------>Relation exists FireplaceQu SalePrice 0.0003226273220519131 H0--Null Hypothsis---- No Relation GarageType SalePrice Dropping the cloumns 0.3800804730475992 H1--Alternate------>Relation exists GarageFinish SalePrice 1.5468634379037297e-12 H0--Null Hypothsis---- No Relation GarageQual SalePrice Dropping the cloumns 0.253478687441585 H0--Null Hypothsis---- No Relation GarageCond SalePrice Dropping the cloumns 1.0 H0--Null Hypothsis---- No Relation PavedDrive SalePrice Dropping the cloumns 0.9912799626956017 H1--Alternate------>Relation exists SaleType SalePrice 4.560785392696702e-14 H1--Alternate------>Relation exists SaleCondition SalePrice 5.613395680294345e-14
transformed_values_cate_data
| MSZoning | Street | LotShape | LotConfig | Neighborhood | MasVnrType | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | Heating | CentralAir | KitchenQual | FireplaceQu | GarageFinish | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | RL | Pave | Reg | Inside | CollgCr | BrkFace | Gd | TA | PConc | Gd | TA | No | GasA | Y | Gd | Gd | RFn | WD | Normal |
| 1 | RL | Pave | Reg | FR2 | Veenker | None | TA | TA | CBlock | Gd | TA | Gd | GasA | Y | TA | TA | RFn | WD | Normal |
| 2 | RL | Pave | IR1 | Inside | CollgCr | BrkFace | Gd | TA | PConc | Gd | TA | Mn | GasA | Y | Gd | TA | RFn | WD | Normal |
| 3 | RL | Pave | IR1 | Corner | Crawfor | None | TA | TA | BrkTil | TA | Gd | No | GasA | Y | Gd | Gd | Unf | WD | Abnorml |
| 4 | RL | Pave | IR1 | FR2 | NoRidge | BrkFace | Gd | TA | PConc | Gd | TA | Av | GasA | Y | Gd | TA | RFn | WD | Normal |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1455 | RL | Pave | Reg | Inside | Gilbert | None | TA | TA | PConc | Gd | TA | No | GasA | Y | TA | TA | RFn | WD | Normal |
| 1456 | RL | Pave | Reg | Inside | NWAmes | Stone | TA | TA | CBlock | Gd | TA | No | GasA | Y | TA | TA | Unf | WD | Normal |
| 1457 | RL | Pave | Reg | Inside | Crawfor | None | Ex | Gd | Stone | TA | Gd | No | GasA | Y | Gd | Gd | RFn | WD | Normal |
| 1458 | RL | Pave | Reg | Inside | mes | None | TA | TA | CBlock | TA | TA | Mn | GasA | Y | Gd | Gd | Unf | WD | Normal |
| 1459 | RL | Pave | Reg | Inside | Edwards | None | Gd | TA | CBlock | TA | TA | No | GasA | Y | TA | Gd | Fin | WD | Normal |
1460 rows × 19 columns
# numeric_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 38 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 1460 non-null int64 1 MSSubClass 1460 non-null int64 2 LotFrontage 1460 non-null float64 3 LotArea 1460 non-null int64 4 OverallQual 1460 non-null int64 5 OverallCond 1460 non-null int64 6 YearBuilt 1460 non-null int64 7 YearRemodAdd 1460 non-null int64 8 MasVnrArea 1460 non-null float64 9 BsmtFinSF1 1460 non-null int64 10 BsmtFinSF2 1460 non-null int64 11 BsmtUnfSF 1460 non-null int64 12 TotalBsmtSF 1460 non-null int64 13 1stFlrSF 1460 non-null int64 14 2ndFlrSF 1460 non-null int64 15 LowQualFinSF 1460 non-null int64 16 GrLivArea 1460 non-null int64 17 BsmtFullBath 1460 non-null int64 18 BsmtHalfBath 1460 non-null int64 19 FullBath 1460 non-null int64 20 HalfBath 1460 non-null int64 21 BedroomAbvGr 1460 non-null int64 22 KitchebvGr 1460 non-null int64 23 TotRmsAbvGrd 1460 non-null int64 24 Fireplaces 1460 non-null int64 25 GarageYrBlt 1460 non-null float64 26 GarageCars 1460 non-null int64 27 GarageArea 1460 non-null int64 28 WoodDeckSF 1460 non-null int64 29 OpenPorchSF 1460 non-null int64 30 EnclosedPorch 1460 non-null int64 31 3SsnPorch 1460 non-null int64 32 ScreenPorch 1460 non-null int64 33 PoolArea 1460 non-null int64 34 MiscVal 1460 non-null int64 35 MoSold 1460 non-null int64 36 YrSold 1460 non-null int64 37 SalePrice 1460 non-null int64 dtypes: float64(3), int64(35) memory usage: 433.6 KB
# transformed_values_cate_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MSZoning 1460 non-null object 1 Street 1460 non-null object 2 LotShape 1460 non-null object 3 LotConfig 1460 non-null object 4 Neighborhood 1460 non-null object 5 MasVnrType 1460 non-null object 6 ExterQual 1460 non-null object 7 ExterCond 1460 non-null object 8 Foundation 1460 non-null object 9 BsmtQual 1460 non-null object 10 BsmtCond 1460 non-null object 11 BsmtExposure 1460 non-null object 12 Heating 1460 non-null object 13 CentralAir 1460 non-null object 14 KitchenQual 1460 non-null object 15 FireplaceQu 1460 non-null object 16 GarageFinish 1460 non-null object 17 SaleType 1460 non-null object 18 SaleCondition 1460 non-null object dtypes: object(19) memory usage: 216.8+ KB
# fulldata_set=pd.concat([ndata,aa,data['SalePrice']],axis=1)
result_dataset = pd.concat([numeric_data, transformed_values_cate_data], axis=1)
result_dataset.head(3)
| Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | ... | BsmtQual | BsmtCond | BsmtExposure | Heating | CentralAir | KitchenQual | FireplaceQu | GarageFinish | SaleType | SaleCondition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 60 | 65.0 | 8450 | 7 | 5 | 2003 | 2003 | 196.0 | 706 | ... | Gd | TA | No | GasA | Y | Gd | Gd | RFn | WD | Normal |
| 1 | 2 | 20 | 80.0 | 9600 | 6 | 8 | 1976 | 1976 | 0.0 | 978 | ... | Gd | TA | Gd | GasA | Y | TA | TA | RFn | WD | Normal |
| 2 | 3 | 60 | 68.0 | 11250 | 7 | 5 | 2001 | 2002 | 162.0 | 486 | ... | Gd | TA | Mn | GasA | Y | Gd | TA | RFn | WD | Normal |
3 rows × 57 columns
for i in result_dataset.columns:
if result_dataset[i].dtypes=='int64' or result_dataset[i].dtypes=='float64':
sns.boxplot(result_dataset[i])
plt.show()
else:
plt.figure(figsize=(10,5))
sns.boxplot(x=result_dataset['SalePrice'],y=result_dataset[i])
plt.show()